{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "TensorRT Text2Image Stable Diffusion Pipeline\n",
    "\n",
    "The TensorRT Pipeline can be used to accelerate the Text2Image Stable Diffusion Inference run.\n",
    "\n",
    "NOTE: The ONNX conversions and TensorRT engine build may take up to 30 minutes. This script was contributed by [Asfiya Baig](https://github.com/asfiyab-nvidia) and the notebook by [Parag Ekbote](https://github.com/ParagEkbote)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: polygraphy in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (0.49.20)\n",
      "Requirement already satisfied: onnx in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (1.17.0)\n",
      "Requirement already satisfied: cuda-python in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (12.8.0)\n",
      "Requirement already satisfied: onnx-graphsurgeon in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (0.5.6)\n",
      "Requirement already satisfied: tensorrt in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (10.9.0.34)\n",
      "Requirement already satisfied: onnxruntime-gpu in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (1.21.0)\n",
      "Collecting colored\n",
      "  Downloading colored-2.3.0-py3-none-any.whl.metadata (3.6 kB)\n",
      "Requirement already satisfied: numpy>=1.20 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from onnx) (1.26.4)\n",
      "Requirement already satisfied: protobuf>=3.20.2 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from onnx) (6.30.1)\n",
      "Requirement already satisfied: cuda-bindings~=12.8.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from cuda-python) (12.8.0)\n",
      "Requirement already satisfied: tensorrt_cu12==10.9.0.34 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from tensorrt) (10.9.0.34)\n",
      "Requirement already satisfied: tensorrt_cu12_libs==10.9.0.34 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from tensorrt_cu12==10.9.0.34->tensorrt) (10.9.0.34)\n",
      "Requirement already satisfied: tensorrt_cu12_bindings==10.9.0.34 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from tensorrt_cu12==10.9.0.34->tensorrt) (10.9.0.34)\n",
      "Requirement already satisfied: nvidia-cuda-runtime-cu12 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from tensorrt_cu12_libs==10.9.0.34->tensorrt_cu12==10.9.0.34->tensorrt) (12.4.127)\n",
      "Requirement already satisfied: coloredlogs in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from onnxruntime-gpu) (15.0.1)\n",
      "Requirement already satisfied: flatbuffers in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from onnxruntime-gpu) (25.2.10)\n",
      "Requirement already satisfied: packaging in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from onnxruntime-gpu) (24.2)\n",
      "Requirement already satisfied: sympy in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from onnxruntime-gpu) (1.13.1)\n",
      "Requirement already satisfied: humanfriendly>=9.1 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from coloredlogs->onnxruntime-gpu) (10.0)\n",
      "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from sympy->onnxruntime-gpu) (1.3.0)\n",
      "Downloading colored-2.3.0-py3-none-any.whl (18 kB)\n",
      "Installing collected packages: colored\n",
      "Successfully installed colored-2.3.0\n",
      "Note: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "pip install polygraphy onnx cuda-python onnx-graphsurgeon tensorrt onnxruntime-gpu colored"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e388ba8b82364f27be70166a7d74439c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6295b4e5c12346cca961ac9f4b67363b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Fetching 28 files:   0%|          | 0/28 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Running inference on device: cuda:0\n",
      "Building Engines...\n",
      "Engine build can take a while to complete\n",
      "Building Engines...\n",
      "Engine build can take a while to complete\n",
      "Building TensorRT engine for /home/zeus/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/5cae40e6a2745ae2b01ad92ae5043f95f23644d6/onnx/unet.opt.onnx: /home/zeus/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/5cae40e6a2745ae2b01ad92ae5043f95f23644d6/engine/unet.plan\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[I] TF32 is disabled by default. Turn on TF32 for better performance with minor accuracy differences.\n",
      "[I] Configuring with profiles:[\n",
      "        Profile 0:\n",
      "            {sample [min=(2, 4, 96, 96), opt=(2, 4, 96, 96), max=(8, 4, 96, 96)],\n",
      "             encoder_hidden_states [min=(2, 77, 1024), opt=(2, 77, 1024), max=(8, 77, 1024)],\n",
      "             timestep [min=[1], opt=[1], max=[1]]}\n",
      "    ]\n",
      "[I] Loading tactic timing cache from /home/zeus/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/5cae40e6a2745ae2b01ad92ae5043f95f23644d6/timing_cache\n",
      "\u001b[38;5;11m[W] profileSharing0806 is on by default in TensorRT 10.0. This flag is deprecated and has no effect.\u001b[0m\n",
      "\u001b[38;5;14m[I] Building engine with configuration:\n",
      "    Flags                  | [FP16]\n",
      "    Engine Capability      | EngineCapability.STANDARD\n",
      "    Memory Pools           | [WORKSPACE: 22699.88 MiB, TACTIC_DRAM: 22699.88 MiB, TACTIC_SHARED_MEMORY: 1024.00 MiB]\n",
      "    Tactic Sources         | []\n",
      "    Profiling Verbosity    | ProfilingVerbosity.DETAILED\n",
      "    Preview Features       | [PROFILE_SHARING_0806]\u001b[0m\n",
      "\u001b[38;5;11m[W] UNSUPPORTED_STATE: Skipping tactic 0 due to insufficient memory on requested size of 27843792896 detected for tactic 0x0000000000000000.\u001b[0m\n",
      "\u001b[38;5;10m[I] Finished engine building in 254.971 seconds\u001b[0m\n",
      "[I] Saving tactic timing cache to /home/zeus/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/5cae40e6a2745ae2b01ad92ae5043f95f23644d6/timing_cache\n",
      "[I] Saving engine to /home/zeus/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/5cae40e6a2745ae2b01ad92ae5043f95f23644d6/engine/unet.plan\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building TensorRT engine for /home/zeus/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/5cae40e6a2745ae2b01ad92ae5043f95f23644d6/onnx/vae.opt.onnx: /home/zeus/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/5cae40e6a2745ae2b01ad92ae5043f95f23644d6/engine/vae.plan\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[I] TF32 is disabled by default. Turn on TF32 for better performance with minor accuracy differences.\n",
      "[I] Configuring with profiles:[\n",
      "        Profile 0:\n",
      "            {latent [min=(1, 4, 96, 96), opt=(1, 4, 96, 96), max=(4, 4, 96, 96)]}\n",
      "    ]\n",
      "[I] Loading tactic timing cache from /home/zeus/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/5cae40e6a2745ae2b01ad92ae5043f95f23644d6/timing_cache\n",
      "\u001b[38;5;14m[I] Building engine with configuration:\n",
      "    Flags                  | [FP16]\n",
      "    Engine Capability      | EngineCapability.STANDARD\n",
      "    Memory Pools           | [WORKSPACE: 22699.88 MiB, TACTIC_DRAM: 22699.88 MiB, TACTIC_SHARED_MEMORY: 1024.00 MiB]\n",
      "    Tactic Sources         | []\n",
      "    Profiling Verbosity    | ProfilingVerbosity.DETAILED\n",
      "    Preview Features       | [PROFILE_SHARING_0806]\u001b[0m\n",
      "\u001b[38;5;10m[I] Finished engine building in 172.855 seconds\u001b[0m\n",
      "[I] Saving tactic timing cache to /home/zeus/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/5cae40e6a2745ae2b01ad92ae5043f95f23644d6/timing_cache\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading TensorRT engine: /home/zeus/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/5cae40e6a2745ae2b01ad92ae5043f95f23644d6/engine/clip.plan\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[I] Saving engine to /home/zeus/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/5cae40e6a2745ae2b01ad92ae5043f95f23644d6/engine/vae.plan\n",
      "[I] Loading bytes from /home/zeus/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/5cae40e6a2745ae2b01ad92ae5043f95f23644d6/engine/clip.plan\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading TensorRT engine: /home/zeus/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/5cae40e6a2745ae2b01ad92ae5043f95f23644d6/engine/unet.plan\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[I] Loading bytes from /home/zeus/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/5cae40e6a2745ae2b01ad92ae5043f95f23644d6/engine/unet.plan\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading TensorRT engine: /home/zeus/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/5cae40e6a2745ae2b01ad92ae5043f95f23644d6/engine/vae.plan\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[I] Loading bytes from /home/zeus/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/5cae40e6a2745ae2b01ad92ae5043f95f23644d6/engine/vae.plan\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "from diffusers import DDIMScheduler\n",
    "from diffusers.pipelines import DiffusionPipeline\n",
    "\n",
    "# Use the DDIMScheduler scheduler here instead\n",
    "scheduler = DDIMScheduler.from_pretrained(\"stabilityai/stable-diffusion-2-1\", subfolder=\"scheduler\")\n",
    "\n",
    "pipe = DiffusionPipeline.from_pretrained(\"stabilityai/stable-diffusion-2-1\",\n",
    "    custom_pipeline=\"stable_diffusion_tensorrt_txt2img\",\n",
    "    variant='fp16',\n",
    "    torch_dtype=torch.float16,\n",
    "    scheduler=scheduler,)\n",
    "\n",
    "# re-use cached folder to save ONNX models and TensorRT Engines\n",
    "pipe.set_cached_folder(\"stabilityai/stable-diffusion-2-1\", variant='fp16',)\n",
    "\n",
    "pipe = pipe.to(\"cuda\")\n",
    "\n",
    "prompt = \"a beautiful photograph of Mt. Fuji during cherry blossom\"\n",
    "image = pipe(prompt).images[0]\n",
    "image.save('tensorrt_mt_fuji.png')"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
